In [2]:
from sklearn import  metrics 
from sklearn.metrics import confusion_matrix
from sklearn import cross_validation
import matplotlib.pylab as plt

import pandas as pd
import numpy as np
churn= pd.read_csv("C:/Users/nawres.jguirim.stg/presentationpython/dataSet/final.csv", sep=';', encoding = "ISO-8859-1")
y = churn['Flag_inactivite']
categoric=churn.select_dtypes(include=['object'])
drop=[ 'subscriber_activation_date', 'period_end','period_start']
categoric  = categoric.drop(drop,axis=1)
num=churn.select_dtypes(include=['int64'])
drop2=['subscriber_id', 'Flag_inactivite']
num=num.drop(drop2,axis=1)
col_categoric_dummies = pd.get_dummies( categoric )
X = pd.concat([num, col_categoric_dummies],axis=1)
from sklearn.model_selection import train_test_split
C:\Users\nawres.jguirim.stg\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
In [227]:
############################################################
#################### random forest ####################
############################################################
In [3]:
Rank= [True,True,True,True,False,False,False,False,True,True,True,True
        ,True,False,True,True,True,True,True,True,True,True,True,True
        ,True,True,True,True,True,True,True,True,True,True,True,True
        ,True,True,True,True,True,True,True,True,True,True,True,False
        ,True,True,True,True,False,True,True,True,True,True,True,False
        ,False,True,True,True,True,True,True,True,True,True,True,False
        ,False,True,True,True,False,True,True,False,False,False,False,False
        ,False,False,False,False,False,False,False,True,False,False,False,False
        ,False,False,False,False,False,False,False,False,False,False,False,False
        ,False,False,False,False,False,True,False,False,False,False,False,False
        ,False,False,False,False,False,False,False,True,False,False,False,False
        ,False,False,False,False,False,False,False,False,False]



data={ 'Features':  X.columns ,'Rank': Rank}
rank=pd.DataFrame( data)
li=rank.loc[rank.Rank ]
X_train, X_test, y_train, y_test = train_test_split(X[li.Features], y, test_size =0.3)

import time
start_time = time.time()
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier( n_estimators=300, max_depth=30, 
                                       min_samples_split=50,min_samples_leaf=25)
#n_estimators=the number of trees in the forest 
random_forest.fit(X_train, y_train)

print("--- %s seconds ---" % (time.time() - start_time))
--- 48.5939998626709 seconds ---
In [4]:
# Predict and get the accuracy score
from sklearn import metrics
predict_y_random_forest = random_forest.predict(X_test)
model_score_random_forest = random_forest.score(X_test, y_test)


preds_random_forest = random_forest.predict_proba(X_test)[:,1]
fp_random_forest, tp_random_forest, threshold_random_forest = metrics.roc_curve(y_test, preds_random_forest)
AUC_random_forest = metrics.auc(fp_random_forest, tp_random_forest)

#K-folds cross validaion
cv_score_random_forest = cross_validation.cross_val_score(random_forest, X_train , y_train, cv=3, scoring='roc_auc')   
In [5]:
cv_mean_random_forest=np.mean(cv_score_random_forest) 
#Print model report:
print ("\nModel Report")
print ("Accuracy : %.3g" % model_score_random_forest )
print ("AUC Score (Test): %0.3f" % AUC_random_forest )
print ("CV Score : Mean - %.4g | Std - %.4g | Min - %.4g | Max - %.4g" % (np.mean(cv_score_random_forest),
                                                                          np.std(cv_score_random_forest),
                                                                          np.min(cv_score_random_forest),
                                                                          np.max(cv_score_random_forest)))    
import math 
y_hat_random_forest= random_forest.predict(X_test)
RSS_random_forest = ((y_hat_random_forest - y_test) ** 2).sum()
k=X.shape[1]
AIC_random_forest= 2*k - 2*math.log(RSS_random_forest )
print( "Akaike information criterion: " + str(AIC_random_forest ))
print('somme  erreur carré ' +str(RSS_random_forest))

confusion_matrix_random_forest = metrics.confusion_matrix(y_test, predict_y_random_forest)
TN= confusion_matrix_random_forest[0, 0]                
FP=confusion_matrix_random_forest[0, 1]
FN=confusion_matrix_random_forest[1, 0]
TP=confusion_matrix_random_forest[1, 1] 
#false negative rate error type I 
fnr_random_forest=FN/(FN+ TP)
#False negaive rate error type II
fpr_random_forest=FP/(FP+TN)

print ("Erreur type I: FN false negative rate: %.3g" % fnr_random_forest )   
print ("Erreur type II:  FP False Positive rate: %.3g" % fpr_random_forest )   


print ("\n ")
print ("Confusion Matrix: ")

print ("          Predicted")
print ("         |  0  |  1  |")
print ("         |-----|-----|")
print ("       0 | %3d | %3d |" % (confusion_matrix_random_forest[0, 0],
                                   confusion_matrix_random_forest[0, 1]))
print ("Actual   |-----|-----|")
print ("       1 | %3d | %3d |" % (confusion_matrix_random_forest[1, 0],
                                   confusion_matrix_random_forest[1, 1]))
print ("         |-----|-----|")
print ("\n ")
from sklearn.metrics import classification_report
print(classification_report(y_test, 
                            predict_y_random_forest, 
                            digits=3))
Model Report
Accuracy : 0.747
AUC Score (Test): 0.826
CV Score : Mean - 0.8273 | Std - 0.00177 | Min - 0.8249 | Max - 0.8289
Akaike information criterion: 265.2640062298918
somme  erreur carré 4307
Erreur type I: FN false negative rate: 0.29
Erreur type II:  FP False Positive rate: 0.216

 
Confusion Matrix: 
          Predicted
         |  0  |  1  |
         |-----|-----|
       0 | 6636 | 1830 |
Actual   |-----|-----|
       1 | 2477 | 6056 |
         |-----|-----|

 
             precision    recall  f1-score   support

          0      0.728     0.784     0.755      8466
          1      0.768     0.710     0.738      8533

avg / total      0.748     0.747     0.746     16999

In [6]:
fig = plt.figure(figsize=(10,4), dpi=1600) 
feat_imp_random_forest = pd.Series(random_forest.feature_importances_,li.Features)[1:70].sort_values(ascending=False)
feat_imp_random_forest.plot(kind='bar', title='Feature Importances')
plt.ylabel('Feature Importance Score')
Out[6]:
Text(0,0.5,'Feature Importance Score')
In [12]:
# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fp_random_forest, tp_random_forest, 'b', label = 'AUC = %0.3f' % AUC_random_forest)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
In [ ]:
##################  MLP  ##################
###########################################
In [7]:
import time
start_time = time.time()

from sklearn.decomposition import PCA
pca= PCA(n_components=37, whiten= True )
pca.fit(X)
X_pca = pca.transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size =0.3 )
    

from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(30),max_iter=500)
mlp.fit(X_train,y_train)
MLPClassifier(activation='relu', alpha=0.001, batch_size='auto', beta_1=0.9,
       beta_2=0.899, early_stopping=True , epsilon=1e-07,
       hidden_layer_sizes=(30), learning_rate='invscaling',
       learning_rate_init=0.1, max_iter=500, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.01, validation_fraction=0.1,
       verbose=False, warm_start=False)
print("--- %s seconds ---" % (time.time() - start_time))
--- 19.31280016899109 seconds ---
In [8]:
# Predict and get the accuracy score
from sklearn import metrics
predict_y_mlp = mlp.predict(X_test)
model_score_mlp =  mlp.score(X_test, y_test)

# calculate the fpr and tpr for all thresholds of the classification
probs_mlp = mlp.predict_proba(X_test)
preds_mlp = probs_mlp[:,1]
fp_mlp, tp_mlp, threshold_mlp = metrics.roc_curve(y_test, preds_mlp)
AUC_mlp = metrics.auc(fp_mlp, tp_mlp)

#K-folds cross validaion
cv_score_mlp = cross_validation.cross_val_score(mlp, X_train , y_train, cv=5, scoring='roc_auc')   
In [9]:
cv_mean_mlp=np.mean(cv_score_mlp)
#Print model report:
print ("\nModel Report")
print ("Accuracy : %.3g" % model_score_mlp )
print ("AUC Score (Test): %0.3f" % AUC_mlp )
print ("CV Score : Mean - %.4g | Std - %.4g | Min - %.4g | Max - %.4g" % (np.mean(cv_score_mlp),
                                                                          np.std(cv_score_mlp),
                                                                          np.min(cv_score_mlp),
                                                                          np.max(cv_score_mlp)))    
confusion_matrix_mlp = metrics.confusion_matrix(y_test, predict_y_mlp )

TN= confusion_matrix_mlp[0, 0]                
FP=confusion_matrix_mlp[0, 1]
FN=confusion_matrix_mlp[1, 0]
TP=confusion_matrix_mlp[1, 1] 
#false negative rate error type I 
fnr_mlp=FN/(FN+ TP)
#False negaive rate error type II
fpr_mlp=FP/(FP+TN)

print ("Erreur type I: FP False Positive rate: %.3g" % fnr_mlp )   
print ("Erreur type II: FP false negative rate: %.3g" % fpr_mlp )   
print ("\n ")
print ("Confusion Matrix: ")

print ("          Predicted")
print ("         |  0   |  1  |")
print ("         |------|------|")
print ("       0 | %3d  | %3d |" % (confusion_matrix_mlp[0, 0],
                                   confusion_matrix_mlp[0, 1]))
print ("Actual   |------|------|")
print ("       1 | %3d | %3d |" % (confusion_matrix_mlp[1, 0],
                                   confusion_matrix_mlp[1, 1]))
print ("         |------|------|")
print ("\n ")
import math 
y_hat_mlp= mlp.predict(X_test)
RSS_mlp = ((y_hat_mlp - y_test) ** 2).sum()
k=X.shape[1]
AIC_mlp= 2*k - 2*math.log(RSS_mlp)
print( "Akaike information criterion: " + str(AIC_mlp))
print ("\n ")
from sklearn.metrics import classification_report
print(classification_report(y_test, 
                            predict_y_mlp, 
                            digits=3))


predictionTrain = mlp.predict(X_train)
print("train !" + str(classification_report(y_train,predictionTrain , digits=3)))
Model Report
Accuracy : 0.727
AUC Score (Test): 0.797
CV Score : Mean - 0.7978 | Std - 0.003775 | Min - 0.7919 | Max - 0.8026
Erreur type I: FP False Positive rate: 0.297
Erreur type II: FP false negative rate: 0.249

 
Confusion Matrix: 
          Predicted
         |  0   |  1  |
         |------|------|
       0 | 6376  | 2112 |
Actual   |------|------|
       1 | 2531 | 5980 |
         |------|------|

 
Akaike information criterion: 265.1137680239602

 
             precision    recall  f1-score   support

          0      0.716     0.751     0.733      8488
          1      0.739     0.703     0.720      8511

avg / total      0.727     0.727     0.727     16999

train !             precision    recall  f1-score   support

          0      0.738     0.763     0.750     19843
          1      0.754     0.728     0.741     19820

avg / total      0.746     0.746     0.746     39663

In [ ]:
fig = plt.figure(figsize=(15,4), dpi=1600) 
feat_imp_gbm= pd.Series(gbm.feature_importances_, selected_features )[1:70].sort_values(ascending=False)
feat_imp_gbm.plot(kind='bar', title='Feature Importances')
plt.ylabel('Feature Importance Score')
In [8]:
# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fp_mlp, tp_mlp, 'b', label = 'AUC = %0.3f' % AUC_mlp)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
In [11]:
#######################    SVM    #####################
#######################################################
In [3]:
##################### regression #####################
######################################################
In [12]:
import time
start_time = time.time()
def correlation(dataset, threshold):
    col_corr = set() # Set of all the names of deleted columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) >= threshold:
                colname = corr_matrix.columns[i] # getting the name of column
                col_corr.add(colname)
                if colname in dataset.columns:
                    del dataset[colname] # deleting the column from the dataset

    return (dataset)
X_corr= pd.DataFrame(correlation( X, .7))

Rank= [False,False,False,False,False,False,False,False,False,False,True,False
,False,False,False,False,False,False,False,False,True,True,True,True
,True,True,True,True,True,True,True,True,True,True,True,True
,False,False,False,False,False,False,False,True,False,True,False,True
,True,False,True,True,True,False,True,False,False,False,True,True
,True,False,True,False,True,True,True,True,True,True]




data={ 'Features':  X.columns ,'Rank': Rank}
rank=pd.DataFrame( data)
li=rank.loc[rank.Rank ]
X_train, X_test, y_train, y_test = train_test_split(X_corr[li.Features], y, test_size =0.3)

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
print("--- %s seconds ---" % (time.time() - start_time))
--- 2.2619855403900146 seconds ---
In [13]:
# Predict and get the accuracy score
from sklearn import metrics
predict_y_logreg =   logreg.predict(X_test)
model_score_logreg =    logreg.score(X_test, y_test)

# calculate the fpr and tpr for all thresholds of the classification
probs_logreg = logreg.predict_proba(X_test)
preds_logreg = probs_logreg[:,1]
fp_logreg, tp_logreg, threshold_logreg = metrics.roc_curve(y_test, preds_logreg)
AUC_logreg = metrics.auc(fp_logreg, tp_logreg)

#K-folds cross validaion
cv_score_logreg = cross_validation.cross_val_score(  logreg, X_train , y_train, cv=3, scoring='roc_auc')   
In [14]:
cv_mean_logreg=np.mean(cv_score_logreg)
#Print model report:
print ("\nModel Report")
print ("Accuracy : %.3g" % model_score_logreg )
print ("AUC Score (Test): %0.3f" % AUC_logreg )
print ("CV Score : Mean - %.4g | Std - %.4g | Min - %.4g | Max - %.4g" % (np.mean(cv_score_logreg),
                                                                          np.std(cv_score_logreg),
                                                                          np.min(cv_score_logreg),
                                                                          np.max(cv_score_logreg)))    
import math 
y_hat_logreg= logreg.predict(X_test)
RSS_logreg = ((y_hat_logreg - y_test) ** 2).sum()
k=X.shape[1]
AIC_logreg= 2*k - 2*math.log(RSS_logreg )
print( "Akaike information criterion: " + str(AIC_logreg ))
print('somme  erreur carré ' +str(RSS_logreg))

confusion_matrix_logreg = metrics.confusion_matrix(y_test, predict_y_logreg)
TN= confusion_matrix_logreg[0, 0]                
FP=confusion_matrix_logreg[0, 1]
FN=confusion_matrix_logreg[1, 0]
TP=confusion_matrix_logreg[1, 1] 
#false negative rate error type I 
fnr_logreg=FN/(FN+ TP)
#False negaive rate error type II
fpr_logreg=FP/(FP+TN)

print ("Erreur type I: FP False Positive rate: %.3g" % fnr_logreg )   
print ("Erreur type II: FP false negative rate: %.3g" % fpr_logreg )   


print ("\n ")
print ("Confusion Matrix: ")

print ("          Predicted")
print ("         |  0   |  1   |")
print ("         |------|------|")
print ("       0 | %3d | %3d |" % (confusion_matrix_logreg[0, 0],
                                   confusion_matrix_logreg[0, 1]))
print ("Actual   |------|------|")
print ("       1 | %3d | %3d |" % (confusion_matrix_logreg[1, 0],
                                   confusion_matrix_logreg[1, 1]))
print ("         |------|------|")
print ("\n ")
from sklearn.metrics import classification_report
print(classification_report(y_test, 
                            predict_y_logreg, 
                            digits=3))
print("train ")
print(classification_report(y_train , 
                            logreg.predict(X_train), 
                            digits=3))
Model Report
Accuracy : 0.656
AUC Score (Test): 0.723
CV Score : Mean - 0.7218 | Std - 0.005108 | Min - 0.7165 | Max - 0.7287
Akaike information criterion: 122.65468541191939
somme  erreur carré 5841
Erreur type I: FP False Positive rate: 0.401
Erreur type II: FP false negative rate: 0.285

 
Confusion Matrix: 
          Predicted
         |  0   |  1   |
         |------|------|
       0 | 6037 | 2410 |
Actual   |------|------|
       1 | 3431 | 5121 |
         |------|------|

 
             precision    recall  f1-score   support

          0      0.638     0.715     0.674      8447
          1      0.680     0.599     0.637      8552

avg / total      0.659     0.656     0.655     16999

train 
             precision    recall  f1-score   support

          0      0.644     0.714     0.677     19884
          1      0.677     0.603     0.638     19779

avg / total      0.661     0.659     0.658     39663

In [24]:
#ROC Curve
plt.figure()
plt.plot(fp_logreg, tp_logreg, label='Logistic Regression (area = %0.3f)' % AUC_logreg)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
In [ ]:
################################Gradient Boosting ################################
###################################################################################
In [16]:
import time
start_time = time.time()

##### for reexecute the code ######
Rank= [True,True,False,True,True,True,False,False,True,True,True,True
,True,False,True,True,False,False,True,True,True,True,True,False
,True,False,True,True,True,True,True,True,True,True,True,True
,True,True,False,False,True,False,True,True,True,True,True,False
,False,False,False,True,True,True,True,True,True,True,True,False
,False,True,False,True,False,True,True,True,True,True,True,False
,True,False,True,True,True,True,True,False,False,False,False,False
,False,False,False,False,False,True,True,True,False,False,False,True
,True,True,False,False,False,False,False,False,True,False,False,False
,False,True,False,True,False,True,False,False,False,False,False,False
,True,False,False,False,False,False,False,False,False,True,False,False
,False,False,False,False,False,False,False,False,False]


X = pd.concat([num, col_categoric_dummies],axis=1)

data={ 'Features':  X.columns ,'Rank': Rank}
rank=pd.DataFrame( data)
li=rank.loc[rank.Rank ]
selected_features=li.Features
X_train, X_test, y_train, y_test = train_test_split(X[selected_features], y, test_size =0.3 ,  stratify=y)

import time
start_time = time.time()
from sklearn.ensemble import GradientBoostingClassifier  #GBM algorithm
gbm = GradientBoostingClassifier( n_estimators=600)
gbm.fit(X_train, y_train)
print("--- %s seconds ---" % (time.time() - start_time))
--- 168.20907497406006 seconds ---
In [17]:
# Predict and get the accuracy score
from sklearn import metrics
predict_y_gbm = gbm.predict(X_test)
model_score_gbm =  gbm.score(X_test, y_test)

# calculate the fpr and tpr for all thresholds of the classification
probs_gbm = gbm.predict_proba(X_test)
preds_gbm = probs_gbm[:,1]
fp_gbm, tp_gbm, threshold_gbm = metrics.roc_curve(y_test, preds_gbm)
AUC_gbm = metrics.auc(fp_gbm, tp_gbm)

#K-folds cross validaion
cv_score_gbm = cross_validation.cross_val_score(gbm, X_train , y_train, cv=3, scoring='roc_auc')   


cv_mean_gbm=np.mean(cv_score_gbm)
#Print model report:
print ("\nModel Report")
print ("Accuracy : %.3g" % model_score_gbm )
print ("AUC Score (Test): %0.3f" % AUC_gbm )
print ("CV Score : Mean - %.4g | Std - %.4g | Min - %.4g | Max - %.4g" % (np.mean(cv_score_gbm),
                                                                          np.std(cv_score_gbm),
                                                                          np.min(cv_score_gbm),
                                                                          np.max(cv_score_gbm)))    
confusion_matrix_gbm = metrics.confusion_matrix(y_test, predict_y_gbm)
import math 
RSS_gbm = ((predict_y_gbm - y_test) ** 2).sum()
k=X.shape[1]
AIC_gbm= 2*k - 2*math.log(RSS_gbm )
print( "Akaike information criterion: " + str(AIC_gbm ))
print('somme  erreur carré ' +str(RSS_gbm))

TN= confusion_matrix_gbm[0, 0]                
FP=confusion_matrix_gbm[0, 1]
FN=confusion_matrix_gbm[1, 0]
TP=confusion_matrix_gbm[1, 1] 
#false negative rate error type I 
fnr_gbm=FN/(FN+ TP)
#False negaive rate error type II
fpr_gbm=FP/(FP+TN)

print ("Erreur type I: FP False Positive rate: %.3g" % fnr_gbm )   
print ("Erreur type II: FP false negative rate: %.3g" % fpr_gbm )   



print ("\n ")
print ("Confusion Matrix: ")

print ("          Predicted")
print ("         |  0  |  1  |")
print ("         |-----|-----|")
print ("       0 | %3d | %3d |" % (confusion_matrix_gbm[0, 0],
                                   confusion_matrix_gbm[0, 1]))
print ("Actual   |-----|-----|")
print ("       1 | %3d | %3d |" % (confusion_matrix_gbm[1, 0],
                                   confusion_matrix_gbm[1, 1]))
print ("         |-----|-----|")
print ("\n ")
from sklearn.metrics import classification_report
print(classification_report(y_test, 
                            predict_y_gbm, 
                            digits=3))
print("Train !")
print(classification_report(y_train, 
                             gbm.predict(X_train), 
                            digits=3))
Model Report
Accuracy : 0.774
AUC Score (Test): 0.854
CV Score : Mean - 0.8498 | Std - 0.002423 | Min - 0.8464 | Max - 0.8518
Akaike information criterion: 265.4883431454363
somme  erreur carré 3850
Erreur type I: FP False Positive rate: 0.246
Erreur type II: FP false negative rate: 0.207

 
Confusion Matrix: 
          Predicted
         |  0  |  1  |
         |-----|-----|
       0 | 6738 | 1762 |
Actual   |-----|-----|
       1 | 2088 | 6411 |
         |-----|-----|

 
             precision    recall  f1-score   support

          0      0.763     0.793     0.778      8500
          1      0.784     0.754     0.769      8499

avg / total      0.774     0.774     0.773     16999

Train !
             precision    recall  f1-score   support

          0      0.791     0.826     0.808     19831
          1      0.818     0.781     0.799     19832

avg / total      0.804     0.804     0.803     39663

In [71]:
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fp_gbm, tp_gbm, 'b', label = 'AUC = %0.3f' % AUC_gbm)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
In [19]:
fig = plt.figure(figsize=(10,4), dpi=1600) 
feat_imp_gbm= pd.Series(gbm.feature_importances_,selected_features )[1:55].sort_values(ascending=False)
feat_imp_gbm.plot(kind='bar', title='Feature Importances')
plt.ylabel('Feature Importance Score')
Out[19]:
Text(0,0.5,'Feature Importance Score')
In [99]:
################ Bays ################
######################################
In [20]:
start_time = time.time()
X_train, X_test, y_train, y_test = train_test_split(X_corr, y, test_size=0.3)
from  sklearn.naive_bayes import BernoulliNB
#Create a Gaussian Classifier
model = BernoulliNB()
# Train the model using the training sets 
bays= model.fit(X_train, y_train)
print("--- %s seconds ---" % (time.time() - start_time))
--- 0.5771849155426025 seconds ---
In [21]:
# Predict and get the accuracy score
from sklearn import metrics
predict_y_bays = bays.predict(X_test)
model_score_bays =  bays.score(X_test, y_test)

# calculate the fpr and tpr for all thresholds of the classification
probs_bays = bays.predict_proba(X_test)
preds_bays = probs_bays[:,1]
fp_bays, tp_bays, threshold_bays = metrics.roc_curve(y_test, preds_bays)
AUC_bays = metrics.auc(fp_bays, tp_bays)

#K-folds cross validaion
cv_score_bays = cross_validation.cross_val_score(bays, X_train , y_train, cv=3, scoring='roc_auc')   
In [231]:
bays=None 
AUC_bays=None
fp_random_bays=None
tp_random_bays=None
threshold_random_bays =None
preds_random_bays=None  
model_score_bays=None 
predict_y_bays=None 
forest_bays=None 
RSS_bays =None 
cv_score_bays =None 
y_hat_bays=None  
gc.collect()
Out[231]:
0
In [22]:
cv_mean_bays =np.mean(cv_score_bays)
#Print model report:
print ("\nModel Report")
print ("Accuracy : %.3g" % model_score_bays )
print ("AUC Score (Test): %0.3f" % AUC_bays )
print ("CV Score : Mean - %.4g | Std - %.4g | Min - %.4g | Max - %.4g" % (np.mean(cv_score_bays),
                                                                          np.std(cv_score_bays),
                                                                          np.min(cv_score_bays),
                                                                          np.max(cv_score_bays)))    
confusion_matrix_bays = metrics.confusion_matrix(y_test, predict_y_bays)
import math 
y_hat_bays= bays.predict(X_test)
RSS_bays = ((y_hat_bays - y_test) ** 2).sum()
k=X.shape[1]
AIC_bays= 2*k - 2*math.log(RSS_bays )
print( "Akaike information criterion: " + str(AIC_bays ))
print('somme  erreur carré ' +str(RSS_bays))

TN= confusion_matrix_bays[0, 0]                
FP=confusion_matrix_bays[0, 1]
FN=confusion_matrix_bays[1, 0]
TP=confusion_matrix_bays[1, 1] 
#false negative rate error type I 
fnr_bays=FN/(FN+ TP)
#False negaive rate error type II
fpr_bays=FP/(FP+TN)

print ("Erreur type I: FP False Positive rate: %.3g" % fnr_bays )   
print ("Erreur type II: FP false negative rate: %.3g" % fpr_bays )   



print ("\n ")
print ("Confusion Matrix: ")

print ("          Predicted")
print ("         |  0  |  1  |")
print ("         |-----|-----|")
print ("       0 | %3d | %3d |" % (confusion_matrix_bays[0, 0],
                                   confusion_matrix_bays[0, 1]))
print ("Actual   |-----|-----|")
print ("       1 | %3d | %3d |" % (confusion_matrix_bays[1, 0],
                                   confusion_matrix_bays[1, 1]))
print ("         |-----|-----|")
print ("\n ")
from sklearn.metrics import classification_report
print(classification_report(y_test, 
                            predict_y_bays))
Model Report
Accuracy : 0.664
AUC Score (Test): 0.726
CV Score : Mean - 0.7344 | Std - 0.001718 | Min - 0.7321 | Max - 0.7361
Akaike information criterion: 264.6965518312523
somme  erreur carré 5720
Erreur type I: FP False Positive rate: 0.408
Erreur type II: FP false negative rate: 0.267

 
Confusion Matrix: 
          Predicted
         |  0  |  1  |
         |-----|-----|
       0 | 6292 | 2287 |
Actual   |-----|-----|
       1 | 3433 | 4987 |
         |-----|-----|

 
             precision    recall  f1-score   support

          0       0.65      0.73      0.69      8579
          1       0.69      0.59      0.64      8420

avg / total       0.67      0.66      0.66     16999

In [81]:
# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fp_bays, tp_bays, 'b', label = 'AUC = %0.3f' % AUC_bays)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 
          1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
In [ ]:
################ Discriminante analysis ################
###################################################
In [45]:
################ arbre de decision ################
###################################################
In [47]:
Rank= [False,False,False,False,False,False,False,False,False,False,False,False
        ,False,False,True,True,True,True,True,True,True,True,True,True
        ,True,True,True,True,True,True,True,True,True,True,True,True
        ,True,True,True,True,True,True,True,True,True,True,True,True
        ,True,False,False,False,False,False,False,False,False,False,False,False
        ,False,False,False,False,False,False,False,False,False,False,False,False
        ,False,False,False,False,False,False,False,False,False,False,False,False
        ,False,False,False,False,False,False,False,False,False,False,False,False
        ,False,False,False,False,False,False,False,False,False,False,True,True
        ,True,True,True,True,True,True,True,True,True,True,True,True
        ,True,True,True,True,True,True,True,True,True,True,True,True
        ,True,True,True,True,True,True,True,True,True]


X = pd.concat([num, col_categoric_dummies],axis=1)

data={ 'Features':  X.columns ,'Rank': Rank}
rank=pd.DataFrame( data)
li=rank.loc[rank.Rank ]
X_train, X_test, y_train, y_test = train_test_split(X[li.Features], y, test_size =0.3)


import time 
start_time = time.time()
from sklearn import tree
DT= tree.DecisionTreeClassifier(  max_depth=7, min_samples_split=150,min_samples_leaf= 100)
DT.fit(X_train, y_train)
print("--- %s seconds ---" % (time.time() - start_time))
--- 0.655195951461792 seconds ---

import time start_time = time.time()

col_categoric= churn[['Gouvernorat','offer', 'device_type_name','Offer_nature_parent', 'Touriste', 'Business']] col_categoric_dummies = pd.get_dummies( col_categoric ) X = np.concatenate([col_numeric, col_categoric_dummies],axis=1)

Splitting X and y into training and testing sets

from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.3 , stratify=y) from sklearn import tree DT=tree.DecisionTreeClassifier() DT.fit(X_train, y_train) print("--- %s seconds ---" % (time.time() - start_time))

In [48]:
# Predict and get the accuracy score
from sklearn import metrics
predict_y_DT = DT.predict(X_test)
model_score_DT =  DT.score(X_test, y_test)

# calculate the fpr and tpr for all thresholds of the classification
probs_DT = DT.predict_proba(X_test)
preds_DT = probs_DT[:,1]
fp_DT, tp_DT, threshold_DT = metrics.roc_curve(y_test, preds_DT)
AUC_DT = metrics.auc(fp_DT, tp_DT)

#K-folds cross validaion
cv_score_DT = cross_validation.cross_val_score(DT, X_train , y_train, cv=3, scoring='roc_auc')   
In [49]:
cv_mean_DT =np.mean(cv_score_DT)

#Print model report:
print ("\nModel Report")
print ("Accuracy : %.3g" % model_score_DT )
print ("AUC Score (Test): %0.3f" % AUC_DT )
print ("CV Score : Mean - %.4g | Std - %.4g | Min - %.4g | Max - %.4g" % (np.mean(cv_score_DT),
                                                                          np.std(cv_score_DT),
                                                                          np.min(cv_score_DT),
                                                                          np.max(cv_score_DT)))    
confusion_matrix_DT = metrics.confusion_matrix(y_test, predict_y_DT)
import math 
y_hat_DT= DT.predict(X_test)
RSS_DT = ((y_hat_DT - y_test) ** 2).sum()
k=X.shape[1]
AIC_DT= 2*k - 2*math.log(RSS_DT )
print( "Akaike information criterion: " + str(AIC_DT ))
print('somme  erreur carré ' +str(RSS_DT))


TN= confusion_matrix_DT[0, 0]                
FP=confusion_matrix_DT[0, 1]
FN=confusion_matrix_DT[1, 0]
TP=confusion_matrix_DT[1, 1] 
#false negative rate error type I 
fnr_DT=FN/(FN+ TP)
#False negaive rate error type II
fpr_DT=FP/(FP+TN)

print ("Erreur type I: FP False Positive rate: %.3g" % fnr_DT )   
print ("Erreur type II: FP false negative rate: %.3g" % fpr_DT )   


print ("\n ")
print ("Confusion Matrix: ")

print ("          Predicted")
print ("         |  0   |   1  |")
print ("         |------|------|")
print ("       0 | %3d | %3d |" % (confusion_matrix_DT[0, 0],
                                   confusion_matrix_DT[0, 1]))
print ("Actual   |------|------|")
print ("       1 | %3d | %3d |" % (confusion_matrix_DT[1, 0],
                                   confusion_matrix_DT[1, 1]))
print ("         |------|------|")
print ("\n ")
from sklearn.metrics import classification_report
print(classification_report(y_test, 
                           predict_y_DT, 
                            digits=3))
Model Report
Accuracy : 0.718
AUC Score (Test): 0.776
CV Score : Mean - 0.7744 | Std - 0.004368 | Min - 0.7685 | Max - 0.7788
Akaike information criterion: 265.05268162121496
somme  erreur carré 4787
Erreur type I: FP False Positive rate: 0.33
Erreur type II: FP false negative rate: 0.233

 
Confusion Matrix: 
          Predicted
         |  0   |   1  |
         |------|------|
       0 | 6483 | 1968 |
Actual   |------|------|
       1 | 2819 | 5729 |
         |------|------|

 
             precision    recall  f1-score   support

          0      0.697     0.767     0.730      8451
          1      0.744     0.670     0.705      8548

avg / total      0.721     0.718     0.718     16999

In [34]:
# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fp_DT, tp_DT, 'b', label = 'AUC = %0.3f' % AUC_DT)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
In [106]:
################################ adaptative boossting ################################
######################################################################################
In [26]:
Rank= [True,True,True,True,True,True,True,True,True,True,True,True
        ,True,True,True,True,True,True,True,True,True,True,True,True
        ,True,False,True,True,False,False,True,True,True,False,False,True
        ,False,True,True,True,False,False,False,True,True,False,False,False
        ,False,False,False,False,False,False,False,True,False,False,False,False
        ,False,False,False,False,False,False,True,False,False,False,False,False
        ,False,False,False,True,True,True,True,True,True,True,True,True
        ,True,True,True,True,False,False,True,False,False,False,True,True
        ,False,False,True,False,False,False,True,False,True,False,False,False
        ,False,True,True,True,False,True,False,False,False,False,False,False
        ,True,False,True,True,True,True,True,True,True,True,False,False
        ,False,False,False,False,False,False,False,False,False]



data={ 'Features':  X.columns ,'Rank': Rank}
rank=pd.DataFrame( data)
li=rank.loc[rank.Rank ]

import time
start_time = time.time()
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X[li.Features], y, test_size =0.3)
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier( n_estimators=400,algorithm='SAMME.R', learning_rate=.6)
ada.fit(X_train, y_train)

print("--- %s seconds ---" % (time.time() - start_time))
--- 48.79554891586304 seconds ---
In [27]:
# Predict and get the accuracy score
from sklearn import metrics
predict_y_ada = ada.predict(X_test)
model_score_ada =  ada.score(X_test, y_test)

# calculate the fpr and tpr for all thresholds of the classification
probs_ada = ada.predict_proba(X_test)
preds_ada = probs_ada[:,1]
fp_ada, tp_ada, threshold_ada = metrics.roc_curve(y_test, preds_ada)
AUC_ada = metrics.auc(fp_ada, tp_ada)

#K-folds cross validaion
cv_score_ada = cross_validation.cross_val_score(ada, X_train , y_train, cv=3, scoring='roc_auc')   
In [28]:
cv_mean_ada =np.mean(cv_score_ada)

#Print model report:
print ("\nModel Report")
print ("Accuracy : %.3g" % model_score_ada )
print ("AUC Score (Test): %0.3f" % AUC_ada )
print ("CV Score : Mean - %.4g | Std - %.4g | Min - %.4g | Max - %.4g" % (np.mean(cv_score_ada),
                                                                          np.std(cv_score_ada),
                                                                          np.min(cv_score_ada),
                                                                          np.max(cv_score_ada)))    
import math 

RSS_ada = (( predict_y_ada - y_test) ** 2).sum()
k=X.shape[1]
AIC_ada= 2*k - 2*math.log(RSS_ada )
print( "Akaike information criterion: " + str(AIC_ada ))
print('somme  erreur carré ' +str(RSS_ada))

confusion_matrix_ada = metrics.confusion_matrix(y_test, predict_y_ada)
TN= confusion_matrix_ada[0, 0]                
FP=confusion_matrix_ada[0, 1]
FN=confusion_matrix_ada[1, 0]
TP=confusion_matrix_ada[1, 1] 
#false negative rate error type I 
fnr_ada=FN/(FN+ TP)
#False negaive rate error type II
fpr_ada=FP/(FP+TN)

print ("Erreur type I: FP False Positive rate: %.3g" % fnr_ada )   
print ("Erreur type II: FP false negative rate: %.3g" % fpr_ada )   


print ("\n ")
print ("Confusion Matrix: ")

print ("          Predicted")
print ("         |  0  |  1    |")
print ("         |------|------|")
print ("       0 | %3d | %3d |" % (confusion_matrix_ada[0, 0],
                                   confusion_matrix_ada[0, 1]))
print ("Actual   |------|------|")
print ("       1 | %3d | %3d |" % (confusion_matrix_ada[1, 0],
                                   confusion_matrix_ada[1, 1]))
print ("         |------|------|")
print ("\n ")
from sklearn.metrics import classification_report
print(classification_report(y_test, 
                            predict_y_ada, 
                            digits=3))
print('train ')
print(classification_report(y_train, 
                        ada.predict(X_train), 
                            digits=3))
Model Report
Accuracy : 0.75
AUC Score (Test): 0.828
CV Score : Mean - 0.8315 | Std - 0.002348 | Min - 0.8283 | Max - 0.834
Akaike information criterion: 265.29300580250913
somme  erreur carré 4245
Erreur type I: FP False Positive rate: 0.272
Erreur type II: FP false negative rate: 0.227

 
Confusion Matrix: 
          Predicted
         |  0  |  1    |
         |------|------|
       0 | 6584 | 1935 |
Actual   |------|------|
       1 | 2310 | 6170 |
         |------|------|

 
             precision    recall  f1-score   support

          0      0.740     0.773     0.756      8519
          1      0.761     0.728     0.744      8480

avg / total      0.751     0.750     0.750     16999

train 
             precision    recall  f1-score   support

          0      0.746     0.786     0.766     19812
          1      0.774     0.733     0.753     19851

avg / total      0.760     0.760     0.759     39663

In [46]:
# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fp_ada, tp_ada, 'b', label = 'AUC = %0.3f' % AUC_ada)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
In [131]:
################################ ExtraTrees ################################
######################################################################################
In [29]:
Rank= [True,True,True,True,False,False,False,False,True,True,True,False
,True,True,True,True,False,False,False,True,True,True,True,True
,True,True,True,True,False,False,True,True,True,True,True,True
,True,True,True,False,False,True,True,True,True,False,False,True
,True,True,True,False,False,False,True,True,True,False,False,True
,True,True,True,False,False,False,True,True,True,False,False,True
,True,True,True,False,False,False,True,False,False,False,False,False
,False,False,False,False,False,False,False,True,False,False,False,False
,True,False,False,False,False,False,False,False,False,False,False,False
,False,False,False,False,False,True,False,False,False,False,False,False
,True,False,True,False,False,False,True,True,False,True,True,True
,True,True,True,True,True,True,True,False,True]


data={ 'Features':  X.columns ,'Rank': Rank}
rank=pd.DataFrame( data)
li=rank.loc[rank.Rank ]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.3)
In [30]:
import time
start_time = time.time()
from sklearn.ensemble import ExtraTreesClassifier

extra= ExtraTreesClassifier( n_estimators=500, max_depth=30, min_samples_split=50, min_samples_leaf=25)


extra.fit(X_train, y_train)
print("--- %s seconds ---" % (time.time() - start_time))
--- 82.30454468727112 seconds ---
In [31]:
# Predict and get the accuracy score
from sklearn import metrics
predict_y_extra = extra.predict(X_test)
model_score_extra =  extra.score(X_test, y_test)

# calculate the fpr and tpr for all thresholds of the classification
probs_extra = extra.predict_proba(X_test)
preds_extra = probs_extra[:,1]
fp_extra, tp_extra, threshold_extra = metrics.roc_curve(y_test, preds_extra)
AUC_extra = metrics.auc(fp_extra, tp_extra)

#K-folds cross validaion
cv_score_extra = cross_validation.cross_val_score(extra, X_train , y_train, cv=3, scoring='roc_auc')   
In [32]:
cv_mean_extra=np.mean(cv_score_extra)
#Print model report:
print ("\nModel Report")
print ("Accuracy : %.3g" % model_score_extra )
print ("AUC Score (Test): %0.3f" % AUC_extra )
print ("CV Score : Mean - %.4g | Std - %.4g | Min - %.4g | Max - %.4g" % (np.mean(cv_score_extra),
                                                                          np.std(cv_score_extra),
                                                                          np.min(cv_score_extra),
                                                                          np.max(cv_score_extra)))    
import math 
y_hat= extra.predict(X_test)
RSS_extra = ((y_hat - y_test) ** 2).sum()
k=X.shape[1]
AIC_extra= 2*k - 2*math.log(RSS_extra )
print( "Akaike information criterion: " + str(AIC_extra ))
print('somme  erreur carré ' +str(RSS_extra))

confusion_matrix_extra = metrics.confusion_matrix(y_test, predict_y_extra)
TN= confusion_matrix_extra[0, 0]                
FP=confusion_matrix_extra[0, 1]
FN=confusion_matrix_extra[1, 0]
TP=confusion_matrix_extra[1, 1] 
#false negative rate error type I 
fnr_extra=FN/(FN+ TP)
#False negaive rate error type II
fpr_extra=FP/(FP+TN)

print ("Erreur type I: FP False Positive rate: %.3g" % fnr_extra )   
print ("Erreur type II: FP false negative rate: %.3g" % fpr_extra )   


print ("\n ")
print ("Confusion Matrix: ")

print ("          Predicted")
print ("         |  0  |  1    |")
print ("         |------|------|")
print ("       0 | %3d | %3d |" % (confusion_matrix_extra[0, 0],
                                   confusion_matrix_extra[0, 1]))
print ("Actual   |------|------|")
print ("       1 | %3d | %3d |" % (confusion_matrix_extra[1, 0],
                                   confusion_matrix_extra[1, 1]))
print ("         |------|------|")
print ("\n ")
from sklearn.metrics import classification_report
print(classification_report(y_test, 
                            predict_y_extra, 
                            digits=3))
print('train ')
print(classification_report(y_train, 
                            extra.predict(X_train), 
                            digits=3))
Model Report
Accuracy : 0.726
AUC Score (Test): 0.790
CV Score : Mean - 0.7919 | Std - 0.002939 | Min - 0.7878 | Max - 0.7945
Akaike information criterion: 265.1090353122755
somme  erreur carré 4654
Erreur type I: FP False Positive rate: 0.31
Erreur type II: FP false negative rate: 0.237

 
Confusion Matrix: 
          Predicted
         |  0  |  1    |
         |------|------|
       0 | 6508 | 2027 |
Actual   |------|------|
       1 | 2627 | 5837 |
         |------|------|

 
             precision    recall  f1-score   support

          0      0.712     0.763     0.737      8535
          1      0.742     0.690     0.715      8464

avg / total      0.727     0.726     0.726     16999

train 
             precision    recall  f1-score   support

          0      0.727     0.785     0.755     19796
          1      0.767     0.706     0.735     19867

avg / total      0.747     0.745     0.745     39663

In [33]:
# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fp_extra, tp_extra, 'b', label = 'AUC = %0.3f' % AUC_extra)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
In [ ]:
####################### bagging clasifier #######################
In [34]:
X = pd.concat([num, col_categoric_dummies],axis=1)
Rank= [True,True,True,True,False,False,False,False,True,True,True,True
        ,True,False,True,True,True,True,True,True,True,True,True,True
        ,True,True,True,True,True,True,True,True,True,True,True,True
        ,True,True,True,True,True,True,True,True,True,True,True,False
        ,True,True,True,True,False,True,True,True,True,True,True,False
        ,False,True,True,True,True,True,True,True,True,True,True,False
        ,False,True,True,True,False,True,True,False,False,False,False,False
        ,False,False,False,False,False,False,False,True,False,False,False,False
        ,False,False,False,False,False,False,False,False,False,False,False,False
        ,False,False,False,False,False,True,False,False,False,False,False,False
        ,False,False,False,False,False,False,False,True,False,False,False,False
        ,False,False,False,False,False,False,False,False,False]


data={ 'Features':  X.columns ,'Rank': Rank}
rank=pd.DataFrame( data)
li=rank.loc[rank.Rank ]
selected_features=li.Features
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X[selected_features], y, test_size =0.3 ,  stratify=y)
len(selected_features)
from sklearn.ensemble import BaggingClassifier
In [35]:
import time
start_time = time.time()
from sklearn.ensemble import BaggingClassifier


bagging = BaggingClassifier( n_estimators=400,max_samples= 0.2,bootstrap=False)
bagging.fit(X_train, y_train)
print("--- %s seconds ---" % (time.time() - start_time))

# Predict and get the accuracy score
from sklearn import metrics
predict_y_bagging = bagging.predict(X_test)
model_score_bagging =  bagging.score(X_test, y_test)

# calculate the fpr and tpr for all thresholds of the classification
probs_bagging = bagging.predict_proba(X_test)
preds_bagging = probs_bagging[:,1]
fp_bagging, tp_bagging, threshold_bagging = metrics.roc_curve(y_test, preds_bagging)
AUC_bagging = metrics.auc(fp_bagging, tp_bagging)

#K-folds cross validaion
cv_score_bagging = cross_validation.cross_val_score(bagging, X_train , y_train, cv=3, scoring='roc_auc')   


cv_mean_bagging=np.mean(cv_score_bagging)
#Print model report:
print ("\nModel Report")
print ("Accuracy : %.3g" % model_score_bagging )
print ("AUC Score (Test): %0.3f" % AUC_bagging )
print ("CV Score : Mean - %.4g | Std - %.4g | Min - %.4g | Max - %.4g" % (np.mean(cv_score_bagging),
                                                                          np.std(cv_score_bagging),
                                                                          np.min(cv_score_bagging),
                                                                          np.max(cv_score_bagging)))    
confusion_matrix_bagging = metrics.confusion_matrix(y_test, predict_y_bagging)
import math 
RSS_bagging = ((predict_y_bagging - y_test) ** 2).sum()
k=X.shape[1]
AIC_bagging= 2*k - 2*math.log(RSS_bagging )
print( "Akaike information criterion: " + str(AIC_bagging ))
print('somme  erreur carré ' +str(RSS_bagging))

TN= confusion_matrix_bagging[0, 0]                
FP=confusion_matrix_bagging[0, 1]
FN=confusion_matrix_bagging[1, 0]
TP=confusion_matrix_bagging[1, 1] 
#false negative rate error type I 
fnr_bagging=FN/(FN+ TP)
#False negaive rate error type II
fpr_bagging=FP/(FP+TN)

print ("Erreur type I: FP False Positive rate: %.3g" % fnr_bagging )   
print ("Erreur type II: FP false negative rate: %.3g" % fpr_bagging )   



print ("\n ")
print ("Confusion Matrix: ")

print ("          Predicted")
print ("         |  0  |  1  |")
print ("         |-----|-----|")
print ("       0 | %3d | %3d |" % (confusion_matrix_bagging[0, 0],
                                   confusion_matrix_bagging[0, 1]))
print ("Actual   |-----|-----|")
print ("       1 | %3d | %3d |" % (confusion_matrix_bagging[1, 0],
                                   confusion_matrix_bagging[1, 1]))
print ("         |-----|-----|")
print ("\n ")
from sklearn.metrics import classification_report
print(classification_report(y_test, 
                            predict_y_bagging, 
                            digits=3))
print("Train !")
print(classification_report(y_train, 
                             bagging.predict(X_train), 
                            digits=3))
--- 231.37771654129028 seconds ---

Model Report
Accuracy : 0.777
AUC Score (Test): 0.855
CV Score : Mean - 0.8389 | Std - 0.003597 | Min - 0.8352 | Max - 0.8438
Akaike information criterion: 265.5202851779628
somme  erreur carré 3789
Erreur type I: FP False Positive rate: 0.255
Erreur type II: FP false negative rate: 0.19

 
Confusion Matrix: 
          Predicted
         |  0  |  1  |
         |-----|-----|
       0 | 6882 | 1618 |
Actual   |-----|-----|
       1 | 2171 | 6328 |
         |-----|-----|

 
             precision    recall  f1-score   support

          0      0.760     0.810     0.784      8500
          1      0.796     0.745     0.770      8499

avg / total      0.778     0.777     0.777     16999

Train !
             precision    recall  f1-score   support

          0      0.858     0.895     0.876     19831
          1      0.890     0.852     0.871     19832

avg / total      0.874     0.874     0.873     39663

In [50]:
table=pd.DataFrame(columns=['Model','AUC','Accuracy'   ,'AIC','somme  erreur carré','CV AUC mean','Error I','Error II'])
table.loc[0]=['Random Forest',AUC_random_forest, model_score_random_forest,  AIC_random_forest, RSS_random_forest, cv_mean_random_forest ,fnr_random_forest, fpr_random_forest]
table.loc[0]=['Bagging Tree',AUC_bagging, model_score_bagging,  AIC_bagging, RSS_bagging, cv_mean_bagging ,fnr_bagging, fpr_bagging]
table.loc[1]=['Multilayer Perceptron',AUC_mlp,  model_score_mlp,    AIC_mlp,RSS_mlp, cv_mean_mlp , fnr_mlp, fpr_mlp]
table.loc[2]=['logistic Regression', AUC_logreg , model_score_logreg,   AIC_logreg, RSS_logreg, cv_mean_logreg ,fnr_logreg, fpr_logreg]
table.loc[3]=['Gradient Tree Boosting',AUC_gbm,  model_score_gbm,   AIC_gbm,RSS_gbm, cv_mean_gbm , fnr_gbm, fpr_gbm]
table.loc[4]=['Naive Bayes ', AUC_bays, model_score_bays,   AIC_bays,RSS_bays, cv_mean_bays , fnr_bays, fpr_bays]
table.loc[6]=['Decision trees', AUC_DT, model_score_DT,  AIC_DT,RSS_DT, cv_mean_DT , fnr_DT, fpr_DT]
table.loc[5]=['Adaptive Boosting', AUC_ada, model_score_ada,    AIC_ada, RSS_ada , cv_mean_ada , fnr_ada, fpr_ada]
table.loc[7]=['Extra Tree',AUC_extra,  model_score_extra,   AIC_extra, RSS_extra , cv_mean_extra , fnr_extra, fpr_extra]
#table.loc[8]=['svm', AUC_svm, model_score_svm,  AUC_svm,  AIC_svm, RSS_svm , 0 , fnr_svm, fpr_svm]

table
Out[50]:
Model AUC Accuracy AIC somme erreur carré CV AUC mean Error I Error II
0 Bagging Tree 0.854589 0.777105 265.520285 3789 0.838925 0.255442 0.190353
1 Multilayer Perceptron 0.797060 0.726866 265.113768 4643 0.797837 0.297380 0.248822
2 logistic Regression 0.723487 0.656392 122.654685 5841 0.721755 0.401193 0.285308
3 Gradient Tree Boosting 0.853585 0.773516 265.488343 3850 0.849777 0.245676 0.207294
4 Naive Bayes 0.725808 0.663510 264.696552 5720 0.734419 0.407720 0.266581
6 Decision trees 0.775514 0.718395 265.052682 4787 0.774413 0.329785 0.232872
5 Adaptive Boosting 0.827701 0.750279 265.293006 4245 0.831466 0.272406 0.227139
7 Extra Tree 0.789825 0.726219 265.109035 4654 0.791939 0.310373 0.237493
In [177]:
  AIC_random_forest
    
Out[177]:
235.20859341234294
In [51]:
# method I: plt
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(10,4), dpi=1600) 
plt.title('Courbe du ROC')
plt.plot(fp_extra, tp_extra, 'b', label = 'Extra Tree = %0.3f' % AUC_extra,color='c')
plt.plot(fp_random_forest, tp_random_forest, 'b', label = 'Random forest = %0.3f' % AUC_random_forest,color='silver')
plt.plot(fp_bagging, tp_bagging, 'b', label = 'Bagging Tree = %0.3f' % AUC_bagging,color='blue')

plt.plot(fp_mlp, tp_mlp, 'b', label = 'mlp = %0.3f' % AUC_mlp,color='yellow')
plt.plot(fp_logreg, tp_logreg, 'b', label = 'logistic Reg = %0.3f' % AUC_logreg ,color='green')
plt.plot(fp_gbm, tp_gbm, 'b', label = 'Gradient B = %0.3f' % AUC_gbm, color='deeppink')
plt.plot(fp_bays, tp_bays, 'b', label = ' Naive Bays = %0.3f' % AUC_bays, color='pink')
plt.plot(fp_DT, tp_DT, 'b', label = 'Decision Tree = %0.3f' % AUC_DT, color='gray')
plt.plot(fp_ada, tp_ada, 'b', label = 'AdaBoost = %0.3f' % AUC_ada, color='orange')
#plt.plot(fpr_svm, tpr_svm, 'b', label = 'AUC svm = %0.3f' % AUC_svm, color='lightcoral')

plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
In [81]:
# AUC 	Score 	AIC 	somme erreur carré 	CV AUC mean 	Error I
fig = plt.figure(figsize=(15,6), dpi=1000) 
# libraries
import numpy as np
import matplotlib.pyplot as plt

# set width of bar
barWidth = 0.15

# set height of bar
#bars1 = table['Score']
bars1 = table['CV AUC mean']
bars2 = table['Accuracy']
bars3 = table['Error I']
bars4 = table['Error II']
#bars4 = table['AUC']

# Set position of bar on X axis
r1 = np.arange(len(bars1))
r2 = [x + barWidth for x in r1]
r3 = [x + barWidth for x in r2]
r4 = [x + barWidth for x in r3]
#r5 = [x + barWidth for x in r4]
#r6 = [x + barWidth for x in r5]

# Make the plot
#plt.bar(r1, bars1, color='#FA8072', width=barWidth, edgecolor='white', label='Score')
plt.bar(r1, bars1, color='#2BDFBB', width=barWidth, edgecolor='white', label='CV AUC mean')
plt.bar(r2, bars2, color='#ffff66', width=barWidth, edgecolor='white', label='Score')
plt.bar(r3, bars3, color='#FF1493', width=barWidth, edgecolor='white', label='Error I')
plt.bar(r4, bars4, color='#FFCBA4', width=barWidth, edgecolor='white', label='Error II')
#plt.bar(r4, bars4, color='#008080', width=barWidth, edgecolor='white', label='AUC')

# Add xticks on the middle of the group bars
plt.xlabel('Compare models', fontweight='bold')
plt.xticks([r + barWidth for r in range(len(bars1))], ['Random Forest', 'Multilayer Perceptron', 
                                                       'logistic Regression', 'Gradient Tree Boosting',
                                                       'Naive Bayes','Decision trees',
                                                       'Adaptive Boosting','Extra Tree','svm'], rotation=60)

for i in range (0,8):
    plt.text(i,.7, round(bars1[i], 2),ma= 'center', ha= 'center',  rotation=90, va='top' , size =  'large' )
    plt.text(0.2+i,.7, round(bars2[i], 2),ma= 'center', ha= 'center',  rotation=90, va='top' , size =  'large')
    plt.text(0.3+i,.3, round(bars3[i], 2),ma= 'center', ha= 'center',  rotation=90, va='top' )
    plt.text(0.5+i,.3, round(bars4[i], 2),ma= 'center', ha= 'center',  rotation=90, va='top' )


# Create legend & Show graphic
plt.legend(loc= 'best')
plt.show()
In [167]:
bars2
Out[167]:
0    0.839461
1    0.818553
2    0.798163
3    0.840060
4    0.753577
6    0.692800
5    0.833625
7    0.800596
Name: AUC, dtype: float64